##########################################################
##########################################################
###########                                    ###########
###########           Chase Joyner             ###########
###########        Mthsc 885 Homework 3        ###########
###########         October 15, 2015           ###########
###########                                    ###########
##########################################################
##########################################################

##########################################################
##########################################################
##########################################################
#######                 Problem 1                  #######

# Drug Data:
# Column 1: Identification code
# Column 2: Age at enrollment
# Column 3: Depression score
# Column 4: Drug use history 1=Never, 2=Previous, 3=Recent
# Column 5: Number of prior drug treatments
# Column 6: Race 0=White and 1=Otherwise
# Column 7: Treatment randomization id 0=Short and 1=Long
# Column 8: Treatment site 0=Site A and 1=Site B
# Column 9: Response variable -- Remained drug free for 12 months 1=Yes and 0=No

data = read.table(file.choose());
Y = data[,9];
x0 = rep(1, dim(data)[1]);
x1 = data[,2];
x2 = data[,3];
x3 = data[,4];
x4 = data[,5];
x5 = data[,6];
x6 = data[,7];
x7 = data[,8];

####### Add dummy variables for the qualitative variables #######
h1 = as.numeric(x3==1);    ## Never
h2 = as.numeric(x3==2);    ## Previous -- Recent is baseline
r1 = as.numeric(x5==1);    ## Otherwise -- White is baseline
t1 = as.numeric(x6==1);    ## Long -- Short is baseline
s1 = as.numeric(x7==1);    ## Site B -- Site A is baseline

####### Get an initial idea of where to start #######
logit = glm(Y ~ x1 + x2 + h1 + h2 + x4 + r1 + t1 + s1, family = binomial(link = logit));      ## AIC = 637.2
probit = glm(Y ~ x1 + x2 + h1 + h2 + x4 + r1 + t1 + s1, family = binomial(link = probit));    ## AIC = 637.4
summary(logit);
summary(probit);
####### Both links imply that the intercept, x1, h1, x4, and t1 are significant given the rest are in the model #######
####### Recall that we have qualitative variables, so we can consider collapsing h2 into the baseline (Recent) #######
####### Then we run stepwise AIC in both directions to obtain starting models, and consider interactions afterwards #######
logit = glm(Y ~ x1 + x2 + h1 + x4 + r1 + t1 + s1, family = binomial(link = logit));     ## AIC = 635.4
probit = glm(Y ~ x1 + x2 + h1 + x4 + r1 + t1 + s1, family = binomial(link = probit));   ## AIC = 635.7
summary(logit);
summary(probit);
step.logit = step(logit, direction = "both");
step.probit = step(probit, direction = "both");
####### Both links indicate the model with x1, h1, x4, and t1 #######
####### From here we consider only the logit link, since it gives the smaller AICs #######

####### First confirm the reduced model against the full model #######
full = glm(Y ~ x1 + x2 + h1 + x4 + r1 + t1 + s1, family = binomial(link = logit));   ## AIC = 635.4
red = glm(Y ~ x1 + h1 + x4 + t1, family = binomial(link = logit));                   ## AIC = 631
anova(red, full, test = "Chisq");
####### Here we obtain a large p-value, indicating the reduced model is sufficient #######

####### Now look at possible interaction terms #######
model = glm(Y ~ (x1 + h1 + x4 + t1)^2, family = binomial(link = logit));   ## AIC = 631.9
summary(model);
####### We see that x1:x4 might be significant, so add it to the reduced model #######
red2 = glm(Y ~ x1 + h1 + x4 + t1 + x1:x4, family = binomial(link = logit));   ## AIC = 628.7
summary(red2);
####### In red2, x1 is now insignificant, so for parsimony we drop x1:x4 and keep x1 #######
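####### (Sketch) As a formal check on dropping x1:x4, a likelihood-ratio test of the nested #######
####### models red and red2 (both fit above) can back up the parsimony argument:            #######
anova(red, red2, test = "Chisq");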
####### Now consider higher order terms #######
x1.2 = x1^2;
x4.2 = x4^2;
red3 = glm(Y ~ x1 + h1 + x4 + t1 + x1.2, family = binomial(link = logit));
red4 = glm(Y ~ x1 + h1 + x4 + t1 + x4.2, family = binomial(link = logit));
summary(red3);
summary(red4);
####### Neither higher order term is significant #######
####### Conclude that the reduced model is our model #######
red = glm(Y ~ x1 + h1 + x4 + t1, family = binomial(link = logit));
summary(red);

####### Now use R to find the top 3 models #######
library(bestglm);
tmp = data.frame("X" = cbind(x1, x2, h1, x4, r1, t1, s1), "Y" = Y);
res1 = bestglm(Xy = tmp, family = binomial, IC = "AIC", TopModels = 3, method = "exhaustive", intercept = TRUE);
res1$BestModels;
####### We see that the best model is our current reduced model #######
####### Let these three be our candidate models, and now cross-validate #######
model1 = glm(Y ~ x1 + h1 + x4 + t1, family = binomial(link = logit));
model2 = glm(Y ~ x1 + h1 + x4 + r1 + t1, family = binomial(link = logit));
model3 = glm(Y ~ x1 + h1 + x4 + t1 + s1, family = binomial(link = logit));

####### Diagnostics of the candidate models #######
library(boot);
glm.diag.plots(model1, iden = TRUE);
####### The residual plot is as expected since we fit a logistic model. Observation 471 is influential; #######
####### observations 350, 575, 19, 120, and 7 are influential and high leverage. #######
glm.diag.plots(model2, iden = TRUE);
####### The residual plot is again as expected. Here, observation 350 is no longer high leverage. #######
glm.diag.plots(model3, iden = TRUE);
####### The residual plot is again as expected. Here, observation 19 is no longer influential #######
####### and observation 350 is no longer high leverage. #######

####### Cross-validate the three models #######
res1 = res2 = res3 = 0;
p = sample(1:575, 10);
for(i in 1:10){
  newY = Y[-p[i]];
  newx1 = x1[-p[i]];
  newh1 = h1[-p[i]];
  newx4 = x4[-p[i]];
  newt1 = t1[-p[i]];
  newr1 = r1[-p[i]];
  news1 = s1[-p[i]];
  X1 = c(1, x1[p[i]], h1[p[i]], x4[p[i]], t1[p[i]]);
  X2 = c(1, x1[p[i]], h1[p[i]], x4[p[i]], r1[p[i]], t1[p[i]]);
  X3 = c(1, x1[p[i]], h1[p[i]], x4[p[i]], t1[p[i]], s1[p[i]]);
  mod1 = glm(newY ~ newx1 + newh1 + newx4 + newt1, family = binomial(link = logit));
  mod2 = glm(newY ~ newx1 + newh1 + newx4 + newr1 + newt1, family = binomial(link = logit));
  mod3 = glm(newY ~ newx1 + newh1 + newx4 + newt1 + news1, family = binomial(link = logit));
  res1 = res1 + (Y[p[i]] - mod1$coefficients %*% X1)^2;
  res2 = res2 + (Y[p[i]] - mod2$coefficients %*% X2)^2;
  res3 = res3 + (Y[p[i]] - mod3$coefficients %*% X3)^2;
}
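####### (Sketch) For a logistic fit, prediction error can also be judged on the probability scale: #######
####### plogis() maps the logit linear predictor returned by predict() back to a fitted probability. #######
####### A minimal in-sample check of this kind for model1 from above:                               #######
mean((Y - plogis(predict(model1)))^2);   ## Brier-type score for model1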
####### res1 = 22.14126, res2 = 23.6422, res3 = 22.56049, so model1 has the smallest prediction error #######
####### and it is also the easiest model to interpret. #######
####### So I choose model1 as my final model, which is defined as #######
####### logit(P(Y = 1)) = -3.08509 + 0.053083 * x1 + 0.735059 * h1 - 0.066286 * x4 + 0.466711 * t1 #######

##########################################################
##########################################################
##########################################################
#######                 Problem 2                  #######

# Data:
# Column 1: ID
# Column 2: Crab color
# Column 3: Spine condition
# Column 4: Carapace width
# Column 5: Weight
# Column 6: Satellites

data = read.table(file.choose());
Y = data[,6];
x0 = rep(1, dim(data)[1]);
x1 = data[,2];
x2 = data[,3];
x3 = data[,4];
x4 = data[,5];
unique(x1);   ## To check how many dummy variables we need for color

####### Create dummy variables #######
c1 = as.numeric(x1==1);
c2 = as.numeric(x1==2);
c3 = as.numeric(x1==3);
s1 = as.numeric(x2==1);
s2 = as.numeric(x2==2);
####### Intuitively, spine condition will not be a significant predictor #######

####### Get an initial idea of where to start #######
log = glm(Y ~ x3 + x4 + c1 + c2 + c3 + s1 + s2, family = poisson(link = log));   ## AIC = 920.86
summary(log);
####### Spine condition is not significant with everything else in the model, further indicating #######
####### that this predictor might not be useful for this model. #######
####### It appears that we could collapse some of the dummy variables, but in this situation #######
####### it does not make sense to collapse any categories together. Now we use forward and #######
####### backward stepwise AIC. #######
step.log = step(log, direction = "both");
####### The step function, using both directions, keeps x4, c1, and c2 as the reduced model #######
####### Note that s1 and s2 were dropped from the model #######
full = log;
red = glm(Y ~ x4 + c1 + c2, family = poisson(link = log));
summary(red);   ## AIC = 915.08
anova(red, full, test = "Chisq");
####### The ANOVA gives a p-value of 0.6959, indicating the reduced model is sufficient #######

model = glm(Y ~ (x4 + c1 + c2)^2, family = poisson(link = log));
summary(model);   ## AIC = 908.55
####### The interaction c1:c2 does not make sense, so fit the model without it #######
model = glm(Y ~ x4 + c1 + c2 + x4:c1 + x4:c2, family = poisson(link = log));
summary(model);   ## AIC = 908.55

####### Check for higher order terms #######
x4.2 = x4^2;
model = glm(Y ~ x4 + c1 + c2 + x4:c1 + x4:c2 + x4.2, family = poisson(link = log));
summary(model);   ## AIC = 902.55
####### The AIC is reduced, but now c2 and x4:c2 become insignificant #######
####### c2 might be insignificant because of x4:c2, so drop the interaction term x4:c2 #######
model = glm(Y ~ x4 + c1 + c2 + x4:c1 + x4.2, family = poisson(link = log));
summary(model);   ## AIC = 901.83
####### Everything is significant and this is the lowest AIC among the models so far #######

####### Now use the bestglm function to obtain candidate models #######
library(bestglm);
x3.2 = x3^2;
tmp = data.frame("X" = cbind(x3, x4, c1, c2, c3, s1, s2, x3.2, x4.2), "Y" = Y);
res = bestglm(Xy = tmp, family = poisson, IC = "AIC", TopModels = 3, method = "exhaustive", intercept = TRUE);
res$BestModels;
####### bestglm indicates that x3 might be quadratically related to Y, which the step function could not pick up #######
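####### (Sketch) A quick overdispersion check on the reduced Poisson fit red from above: the Pearson #######
####### chi-square statistic divided by the residual df; values well above 1 would suggest the       #######
####### Poisson variance assumption is too restrictive.                                              #######
sum(residuals(red, type = "pearson")^2) / red$df.residual;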
####### Note that crab spine condition is still not a significant predictor, so the three models to consider are #######
model1 = glm(Y ~ x3 + x4 + c1 + c2 + x3.2, family = poisson(link = log));      ## AIC = 901.078
model2 = glm(Y ~ x4 + c1 + c2 + x4:c1 + x4.2, family = poisson(link = log));   ## AIC = 901.83
model3 = glm(Y ~ x3 + x4 + c3 + x3.2, family = poisson(link = log));           ## AIC = 901.906

####### Diagnostics of these models #######
library(boot);
glm.diag.plots(model1, iden = TRUE);
## For this model there appear to be about 16 influential points and 16 high leverage points.
glm.diag.plots(model2, iden = TRUE);
## Here, about 15 influential points and about 11 high leverage points.
glm.diag.plots(model3, iden = TRUE);
## Here, about 9 influential points and about 11 high leverage points.

## Cross-validate
res1 = res2 = res3 = 0;
p = sample(1:173, 5);
x4.c1 = x4*c1;
for(i in 1:5){
  newY = Y[-p[i]];
  newc1 = c1[-p[i]];
  newc2 = c2[-p[i]];
  newc3 = c3[-p[i]];
  newx3 = x3[-p[i]];
  newx4 = x4[-p[i]];
  newx3.2 = x3.2[-p[i]];
  newx4.2 = x4.2[-p[i]];
  newx4.c1 = x4.c1[-p[i]];
  X1 = c(1, x3[p[i]], x4[p[i]], c1[p[i]], c2[p[i]], x3.2[p[i]]);
  X2 = c(1, x4[p[i]], c1[p[i]], c2[p[i]], x4.c1[p[i]], x4.2[p[i]]);
  X3 = c(1, x3[p[i]], x4[p[i]], c3[p[i]], x3.2[p[i]]);
  mod1 = glm(newY ~ newx3 + newx4 + newc1 + newc2 + newx3.2, family = poisson(link = log));
  mod2 = glm(newY ~ newx4 + newc1 + newc2 + newx4.c1 + newx4.2, family = poisson(link = log));
  mod3 = glm(newY ~ newx3 + newx4 + newc3 + newx3.2, family = poisson(link = log));
  res1 = res1 + (Y[p[i]] - mod1$coefficients %*% X1)^2;
  res2 = res2 + (Y[p[i]] - mod2$coefficients %*% X2)^2;
  res3 = res3 + (Y[p[i]] - mod3$coefficients %*% X3)^2;
}
## res1 = 2.59, res2 = 2.85, res3 = 3.06, so model1 has the smallest prediction error; it also had the smallest AIC. ##
## The final model is ##
## log(E[Y]) = -20.833522 + 1.494626 * x3 + 0.678886 * x4 + 0.378522 * c1 + 0.234012 * c2 - 0.027755 * x3^2 ##

##########################################################
##########################################################
##########################################################
#######                 Problem 3                  #######

# Data:
# Column 1: Sex
# Column 2: Age
# Column 3: Finish time
# Column 4: Place

data = read.table(file.choose(), header = TRUE);
Y = data[,3];
x1 = data[,1];
x2 = data[,2];
N = length(Y);

## We have missing ages (coded as 0); since they are a small portion of the data set, remove those rows ##
tmp = c(NA);
s = 1;
for(i in 1:N){
  if(x2[i] == 0){
    tmp[s] = i;
    s = s + 1;
  }
}
Y = Y[-tmp];
x1 = x1[-tmp];
x2 = x2[-tmp];

## Only 2 levels for sex, so we need one dummy variable ##
s1 = as.numeric(x1 == 1);   ## Baseline is male

## Get an initial idea of where to start ##
## We believe the finish times to be right-skewed, so try a Gamma GLM with log and inverse links ##
log = glm(Y ~ s1 + x2, family = Gamma(link = log));
inverse = glm(Y ~ s1 + x2, family = Gamma(link = inverse));
summary(log);       ## AIC = 67296. Sex is significant, Age is not -- against intuition. Positive coefficient on females makes sense.
summary(inverse);   ## AIC = 67296. Sex is significant, Age is not -- against intuition. Negative coefficient on females; note that with
                    ## the inverse link a negative coefficient corresponds to a larger mean, so the sign is harder to read directly.

## Now try stepwise AIC ##
step.log = step(log, direction = "both");           ## Keeps Sex and removes Age, AIC = 67296
step.inverse = step(inverse, direction = "both");   ## Again keeps Sex and removes Age, AIC = 67296
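## (Sketch) A side-by-side AIC comparison of the two links fit above, using stats::AIC on the ##
## fitted glm objects named log and inverse: ##
AIC(log, inverse);   ## both links give essentially the same AIC here (about 67296)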
## The inverse link gives similar results with a slightly smaller AIC, so we proceed with it ##

## Check full versus reduced models ##
full = log;
red = glm(Y ~ s1, family = Gamma(link = log));
anova(red, full, test = "Chisq");
full = inverse;
red = glm(Y ~ s1, family = Gamma(link = inverse));
anova(red, full, test = "Chisq");
## The reduced models look sufficient ##

## Now we look for higher-order terms and interactions ##
mod = glm(Y ~ (s1 + x2)^2, family = Gamma(link = inverse));
summary(mod);    ## AIC = 67293, the interaction s1:x2 is significant
x2.2 = x2^2;
mod1 = glm(Y ~ s1 + x2 + x2.2, family = Gamma(link = inverse));
summary(mod1);   ## AIC = 65574, x2^2 seems significant

## Check the model with both of these included, then run stepwise AIC ##
mod = glm(Y ~ s1 + x2 + x2.2 + s1:x2, family = Gamma(link = inverse));
summary(mod);    ## AIC = 65569, all terms significant
step.mod = step(mod, direction = "both");   ## Removes nothing; all terms remain significant

## Run bestglm ##
library(bestglm);
tmp = data.frame("X" = cbind(s1, x2, x2.2, s1*x2), "Y" = Y);
res = bestglm(Xy = tmp, family = Gamma(link = inverse), IC = "AIC", TopModels = 3, method = "exhaustive", intercept = TRUE);
res$BestModels;
## bestglm confirms that mod is a good model. We will use the top 3 models to run diagnostics and cross-validate ##
## to choose the best model. All of these models include age, which also agrees with intuition. ##
s1.x2 = s1*x2;
model1 = glm(Y ~ s1 + x2 + x2.2 + s1.x2, family = Gamma(link = inverse));   ## AIC = 65569
model2 = glm(Y ~ s1 + x2 + x2.2, family = Gamma(link = inverse));           ## AIC = 65574
model3 = glm(Y ~ x2 + x2.2 + s1.x2, family = Gamma(link = inverse));        ## AIC = 65710

## Diagnostics on these models ##
library(boot);
glm.diag.plots(model1, iden = TRUE);
glm.diag.plots(model2, iden = TRUE);
glm.diag.plots(model3, iden = TRUE);
## The diagnostic plots for models 1 and 2 show many observations with high influence and high leverage, ##
## so I chose model 3. Now we cross-validate to make sure model 3 is reasonable. ##

## Cross-validate ##
res1 = res2 = res3 = 0;
p = sample(1:length(Y), 100);   ## sample from the remaining observation indices (rows with age 0 were removed above)
for(i in 1:100){
  newY = Y[-p[i]];
  news1 = s1[-p[i]];
  newx2 = x2[-p[i]];
  newx2.2 = x2.2[-p[i]];
  news1.x2 = s1.x2[-p[i]];
  X1 = c(1, s1[p[i]], x2[p[i]], x2.2[p[i]], s1.x2[p[i]]);
  X2 = c(1, s1[p[i]], x2[p[i]], x2.2[p[i]]);
  X3 = c(1, x2[p[i]], x2.2[p[i]], s1.x2[p[i]]);
  mod1 = glm(newY ~ news1 + newx2 + newx2.2 + news1.x2, family = Gamma(link = inverse));
  mod2 = glm(newY ~ news1 + newx2 + newx2.2, family = Gamma(link = inverse));
  mod3 = glm(newY ~ newx2 + newx2.2 + news1.x2, family = Gamma(link = inverse));
  res1 = res1 + (Y[p[i]] - mod1$coefficients %*% X1)^2;
  res2 = res2 + (Y[p[i]] - mod2$coefficients %*% X2)^2;
  res3 = res3 + (Y[p[i]] - mod3$coefficients %*% X3)^2;
}
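## (Sketch) Report the accumulated squared prediction errors to check whether model 3 remains reasonable ##
c(res1 = res1, res2 = res2, res3 = res3);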